# ==============================================================================
# name: RQ4-repeated-participation.R
# date:	Jan 25, 2022
# author: Bernhard Clemm / Tiago Ventura
# purpose: Quantify attempts at repeated participation and compare professionals and non-professionals
# ==============================================================================

rm(list = ls())

source("code/utils/styles.R")
source("code/utils/functions.R")

# READ IN, FILTER AND BIND DATA ================================================

# Person-level analysis files, one per sample (Facebook, Lucid, YouGov).
fb <- read.csv("data/analysis_FB.csv")

lu <- read.csv("data/analysis_LU.csv")

yg <- read.csv("data/analysis_YG.csv")

# Reduce one sample's analysis file to the variables needed for RQ4.
#
# dt: data frame read from one of the analysis CSVs above.
# Returns one row per person_id, restricted to people with at least seven
# active browsing days, with missing repeated-participation measures set to 0.
prepare_repeated_sample <- function(dt) {
  dt %>%
    # presumably harmonises the id type so the samples can be stacked below
    mutate(person_id = as.character(person_id)) %>%
    select(dataset, person_id, weight, age_high, white, party_bin,
           starts_with("professional_"), n_days_active,
           starts_with("count_repeated_urls_"),
           starts_with("any_repeated_"), starts_with("prop_repeated_")) %>%
    # this was measured across waves, so we can deduplicate
    distinct(person_id, .keep_all = TRUE) %>%
    filter(n_days_active >= 7) %>%
    # fill in 0 for those people without repeated participation, but with
    # browsing data
    mutate(across(c(starts_with("count_repeated_urls_"),
                    starts_with("any_repeated_"),
                    starts_with("prop_repeated_")),
                  ~ ifelse(is.na(.), 0, .)))
}

repeated_fb <- prepare_repeated_sample(fb)

repeated_lu <- prepare_repeated_sample(lu)

repeated_yg <- prepare_repeated_sample(yg)

# Stack the three samples and declare the survey design. Respondents without
# a weight get a weight of 1.
repeated <- bind_rows(repeated_fb, repeated_lu, repeated_yg) %>%
  mutate(weight = ifelse(is.na(weight), 1, weight)) %>%
  as_survey_design(weights = weight)

# MAIN PAPER ===================================================================

## Table 3: Repeated questionnaire participation ####

# Design-weighted summary per sample (1-hour cutoff). Each survey_mean() call
# contributes the estimate plus the matching "<name>_se" column used below;
# the share and the proportion are rescaled to percentages.
repeated_1h_summ <- repeated %>%
  group_by(dataset) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100)

# Format every measure as "estimate (SE)", then reshape to one row per measure
# and one column per sample.
repeated_1h_tab <- repeated_1h_summ %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = "dataset", values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated(
  dt = repeated_1h_tab,
  caption = "Repeated questionnaire participation",
  format = "html",
  file =  "output/tab3_rq4_repeated_participation.html",
  label = "rep-part-1h")
  
## Table 4: Repeated questionnaire participation, professionals vs. non-professionals ####

# Design-weighted summary per sample and professional status (indicator
# professional_1, 1-hour cutoff). survey_mean() supplies the "<name>_se"
# columns used below.
repeated_1h_summ_prof <- repeated %>%
  group_by(dataset, professional_1) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_1h_prof_tab <- repeated_1h_summ_prof %>%
  filter(!is.na(professional_1)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_1"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated_prof(
  dt = repeated_1h_prof_tab,
  caption = "Repeated questionnaire participation, professionals vs. non-professionals (professionals = more than 50 percent of browsing time to survey sites)",
  format = "html",
  file =  "output/tab4_rq4_repeated_participation_prof.html",
  label = "rep-part-1h-profs")

# SM F.1: ALTERNATIVE TIME CUTOFFS =============================================

## Table F.12: Repeated questionnaire participation (6-hour cutoff) ####

# Design-weighted summary per sample using the 6-hour cutoff measures.
# survey_mean() supplies the "<name>_se" columns used below.
repeated_6h_summ <- repeated %>%
  group_by(dataset) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_6h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_6h),
            perc_repeated_mean = survey_mean(prop_repeated_6h) * 100)

# Format every measure as "estimate (SE)", then reshape to one row per measure
# and one column per sample.
repeated_6h_tab <- repeated_6h_summ %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = "dataset", values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated(
  dt = repeated_6h_tab,
  caption = "Repeated questionnaire participation (6-hour cutoff)",
  format = "html",
  file =  "output/tabF12_rq4_repeated_participation_6h.html",
  label = "rep-part-6h")

## Table F.13: Repeated questionnaire participation (6-hour cutoff), professionals vs. non-professionals ####

# Design-weighted summary per sample and professional status (indicator
# professional_1) using the 6-hour cutoff measures. survey_mean() supplies the
# "<name>_se" columns used below.
repeated_6h_summ_prof <- repeated %>%
  group_by(dataset, professional_1) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_6h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_6h),
            perc_repeated_mean = survey_mean(prop_repeated_6h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_6h_tab_prof <- repeated_6h_summ_prof %>%
  filter(!is.na(professional_1)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_1"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated_prof(
  dt = repeated_6h_tab_prof,
  caption = "Repeated questionnaire participation (6-hour cutoff), professionals vs. non-professionals (professionals = more than 50 percent of browsing time to survey sites)",
  format = "html",
  file =  "output/tabF13_rq4_repeated_participation_6h_prof.html",
  label = "rep-part-6h-profs")

## Table F.14: Repeated questionnaire participation (24-hour cutoff) ####

# Design-weighted summary per sample using the 24-hour cutoff measures.
# survey_mean() supplies the "<name>_se" columns used below.
repeated_24h_summ <- repeated %>%
  group_by(dataset) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_24h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_24h),
            perc_repeated_mean = survey_mean(prop_repeated_24h) * 100)

# Format every measure as "estimate (SE)", then reshape to one row per measure
# and one column per sample. Stored under its own name so it is not confused
# with the professionals table of the next section.
repeated_24h_tab <- repeated_24h_summ %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = "dataset", values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated(
  dt = repeated_24h_tab,
  caption = "Repeated questionnaire participation (24-hour cutoff)",
  format = "html",
  file =  "output/tabF14_rq4_repeated_participation_24h.html",
  label = "rep-part-24h")

## Table F.15: Repeated questionnaire participation (24-hour cutoff), professionals vs. non-professionals ####

# Design-weighted summary per sample and professional status (indicator
# professional_1) using the 24-hour cutoff measures. survey_mean() supplies
# the "<name>_se" columns used below.
repeated_24h_summ_prof <- repeated %>%
  group_by(dataset, professional_1) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_24h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_24h),
            perc_repeated_mean = survey_mean(prop_repeated_24h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_24h_tab_prof <- repeated_24h_summ_prof %>%
  filter(!is.na(professional_1)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_1"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated_prof(
  dt = repeated_24h_tab_prof,
  caption = "Repeated questionnaire participation (24-hour cutoff), professionals vs. non-professionals (professionals = more than 50 percent of browsing time to survey sites)",
  format = "html",
  file =  "output/tabF15_rq4_repeated_participation_24h_prof.html",
  label = "rep-part-24h-profs")

# SM F.2: ALTERNATIVE PROFESSIONALISM INDICATORS ===============================

## Table F.16: Repeated questionnaire participation, professionals vs. non-professionals (professionals = more than 50 percent visits to survey sites) ####

# Design-weighted summary per sample and professional status, using the
# alternative indicator professional_2 (1-hour cutoff). survey_mean() supplies
# the "<name>_se" columns used below.
repeated_1h_summ_prof_2 <- repeated %>%
  group_by(dataset, professional_2) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_1h_tab_prof_2 <- repeated_1h_summ_prof_2 %>%
  filter(!is.na(professional_2)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_2"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated_prof(
  dt = repeated_1h_tab_prof_2,
  caption = "Repeated questionnaire participation (1-hour cutoff), professionals vs. non-professionals (professionals = more than 50 percent visits to survey sites)",
  format = "html",
  file =  "output/tabF16_rq4_repeated_participation_prof_2.html",
  label = "rep-part-1h-profs-c")

## Table F.17: Repeated questionnaire participation, professionals vs. non-professionals (professionals = more than 50 percent of browsing time to survey sites) ####

# Design-weighted summary per sample and professional status, using the
# alternative indicator professional_3 (1-hour cutoff). survey_mean() supplies
# the "<name>_se" columns used below.
repeated_1h_summ_prof_3 <- repeated %>%
  group_by(dataset, professional_3) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_1h_tab_prof_3 <- repeated_1h_summ_prof_3 %>%
  filter(!is.na(professional_3)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_3"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

# NOTE(review): this caption gives the same definition of "professional" as
# the professional_1 tables; confirm it describes professional_3 correctly.
export_kable_repeated_prof(
  dt = repeated_1h_tab_prof_3,
  caption = "Repeated questionnaire participation (1-hour cutoff), professionals vs. non-professionals (professionals = more than 50 percent of browsing time to survey sites)",
  format = "html",
  file =  "output/tabF17_rq4_repeated_participation_prof_3.html",
  label = "rep-part-1h-profs-d")

## Table F.18: Repeated questionnaire participation, professionals vs. non-professionals (professionals = any of the measures) ####

# Design-weighted summary per sample and professional status, using the
# combined indicator professional_all (1-hour cutoff). survey_mean() supplies
# the "<name>_se" columns used below.
repeated_1h_summ_prof_any <- repeated %>%
  group_by(dataset, professional_all) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100)

# Drop respondents with unknown professional status, format every measure as
# "estimate (SE)", and reshape to one column per sample-by-status cell.
repeated_1h_tab_prof_any <- repeated_1h_summ_prof_any %>%
  filter(!is.na(professional_all)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "professional_all"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

export_kable_repeated_prof(
  dt = repeated_1h_tab_prof_any,
  caption = "Repeated questionnaire participation (1-hour cutoff), professionals vs. non-professionals (professionals = any of the measures)",
  format = "html",
  file =  "output/tabF18_rq4_repeated_participation_prof_any.html",
  label = "rep-part-1h-profs-any")

# SM F.3 DISAGGREGATION BY QUESTIONNAIRE PLATFORMS =============================

# Pre-summarised platform-level results, one file per sample; tag each with
# the sample name so the files can be stacked.
repeated_platform_fb <- read.csv("data/browsing_summarized/platforms_repeated_FB.csv") %>%
  mutate(dataset = "Facebook")
repeated_platform_lu <- read.csv("data/browsing_summarized/platforms_repeated_LU.csv") %>%
  mutate(dataset = "Lucid")
repeated_platform_yg <- read.csv("data/browsing_summarized/platforms_repeated_YG.csv") %>%
  mutate(dataset = "YouGov")

# Reshape to one row per measure and remaining identifier column(s), with one
# column per sample. Sorting by measure name keeps each measure's rows
# together for the row groups below.
repeated_platform <- bind_rows(
  repeated_platform_fb, repeated_platform_lu, repeated_platform_yg) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = "dataset", values_from = "value")  %>%
  arrange(name)

## Table F.19: Repeated questionnaire participation, by questionnaire platform ####

# The pack_rows() ranges are hard-coded: they assume 11 rows per measure
# (33 rows in total), in the order produced by arrange(name) above.
kable(
  repeated_platform %>% select(-name),
  caption = "Repeated questionnaire participation (1-hour cutoff), by questionnaire platform",
  format = "html", booktabs = TRUE, escape = FALSE,
  label = "rep-part-1h-byplatform", linesep = "\\addlinespace",
  col.names = c("", "Facebook", "Lucid", "YouGov"), row.names = FALSE) %>%
  kable_styling(full_width = FALSE, latex_options = c("HOLD_position", "scale_down")) %>%
  pack_rows("Subjects taking at least one questionnaire repeatedly (%)", 1, 11) %>%
  pack_rows("Number of repeated questionnaires per participant (mean)", 12, 22) %>%
  pack_rows("Percent of repeated questionnaires per participants (mean)", 23, 33) %>%
  save_kable(., file = "output/tabF19_rq4_repeated_participation_platforms.html")

# SM F.4 DISAGGREGATION BY HARD-TO-REACH GROUPS ================================

## Table F.20: Repeated questionnaire participation (1-hour cutoff) among hard to reach groups: Age ####

# Design-weighted summary per sample and age group (1-hour cutoff), formatted
# as "estimate (SE)" and reshaped to one column per sample-by-group cell.
# Respondents with missing age_high are left out of the table.
repeated_age <- repeated %>%
  group_by(dataset, age_high) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100) %>%
  filter(!is.na(age_high)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "age_high"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

# NOTE(review): the column labels are positional; confirm that the age_high
# groups come out in the order "Age >= 65", "Age < 65" within each sample.
export_kable_repeated_groups(
  dt = repeated_age,
  caption = "Repeated questionnaire participation (1-hour cutoff) among hard to reach groups (age)",
  format = "html", file = "output/tabF20_rq4_repeated_participation_groups_age.html",
  colnames = c("", rep(c("Age >= 65", "Age < 65"), 3)),
  label = "rep-part-1h-profs-age"
)

## Table F.21: Repeated questionnaire participation (1-hour cutoff) among hard to reach groups: Race ####

# Design-weighted summary per sample and racial group (1-hour cutoff),
# formatted as "estimate (SE)" and reshaped to one column per sample-by-group
# cell. Respondents with missing `white` are left out of the table.
repeated_white <- repeated %>%
  group_by(dataset, white) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100) %>%
  filter(!is.na(white)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "white"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

# NOTE(review): the column labels are positional; confirm that the `white`
# groups come out in the order "White", "Non-White" within each sample.
export_kable_repeated_groups(
  dt = repeated_white,
  caption = "Repeated questionnaire participation (1-hour cutoff) among hard to reach groups (race)",
  format = "html", file = "output/tabF21_rq4_repeated_participation_groups_race.html",
  colnames = c("", rep(c("White", "Non-White"), 3)),
  label = "rep-part-1h-profs-race"
)

## Table F.22: Repeated questionnaire participation (1-hour cutoff) among hard to reach groups: Partisanship ####

# Design-weighted summary per sample and party group (1-hour cutoff),
# formatted as "estimate (SE)" and reshaped to one column per sample-by-group
# cell. Respondents with missing party_bin are left out of the table.
repeated_party <- repeated %>%
  group_by(dataset, party_bin) %>%
  summarise(any_repeated_perc = survey_mean(any_repeated_1h) * 100,
            count_repeated_urls_mean = survey_mean(count_repeated_urls_1h),
            perc_repeated_mean = survey_mean(prop_repeated_1h) * 100) %>%
  filter(!is.na(party_bin)) %>%
  mutate(across(where(is.numeric), ~ roundr(., 1))) %>%
  mutate(any_repeated_perc = paste0(any_repeated_perc, " (", any_repeated_perc_se, ")"),
         count_repeated_urls_mean = paste0(count_repeated_urls_mean, " (", count_repeated_urls_mean_se, ")"),
         perc_repeated_mean = paste0(perc_repeated_mean, " (", perc_repeated_mean_se, ")")) %>%
  # the standard errors now live inside the formatted strings
  select(-contains("_se")) %>%
  pivot_longer(any_repeated_perc:perc_repeated_mean) %>%
  pivot_wider(names_from = c("dataset", "party_bin"), values_from = "value") %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "any_repeated_perc" ~ "Subjects taking at least \\\\ one questionnaire repeatedly (\\%)",
    name == "count_repeated_urls_mean" ~ "Number of repeated questionnaires \\\\ per participant (mean)",
    name == "perc_repeated_mean" ~ "Percent of repeated questionnaires \\\\ per participants (mean)"))

# NOTE(review): the column labels are positional; confirm that the party_bin
# groups come out in the order "Democrat", "Republican" within each sample.
export_kable_repeated_groups(
  dt = repeated_party,
  caption = "Repeated questionnaire participation (1-hour cutoff) among hard to reach groups (partisanship)",
  format = "html", file = "output/tabF22_rq4_repeated_participation_groups_partisanship.html",
  colnames = c("", rep(c("Democrat", "Republican"), 3)),
  label = "rep-part-1h-profs-party"
)

# SM F.5 BREAK PATTERNS AS AN ALTERNATIVE EXPLANATION ==========================

## Facebook sample ####

# Per-respondent break indicators for the Facebook donor survey: every
# "submit" column is one timed page (in seconds), and a page counts as a break
# when its duration exceeds 1, 6 or 24 hours.
# NOTE(review): unlike the external samples, pages with a missing duration are
# not filtered out here, so they still count towards n_questions.
survey_fb_donors <- read.csv("data/surveys_processed/survey_FB_donors.csv") %>%
  select(person_id, contains("submit")) %>%
  pivot_longer(contains("submit"),
               names_to = "question", values_to = "duration_s") %>%
  mutate(duration_m = duration_s / 60,
         duration_over_1h = as.numeric(duration_m > 60),
         duration_over_6h = as.numeric(duration_m > 360),
         duration_over_24h = as.numeric(duration_m > 1440)) %>%
  group_by(person_id) %>%
  summarise(n_questions = n(),
            n_questions_over_1h = sum(duration_over_1h, na.rm = TRUE),
            n_questions_over_6h = sum(duration_over_6h, na.rm = TRUE),
            n_questions_over_24h = sum(duration_over_24h, na.rm = TRUE)) %>%
  mutate(prop_questions_over_1h = n_questions_over_1h / n_questions,
         prop_questions_over_6h = n_questions_over_6h / n_questions,
         prop_questions_over_24h = n_questions_over_24h / n_questions,
         any_questions_over_1h = as.numeric(n_questions_over_1h > 0),
         any_questions_over_6h = as.numeric(n_questions_over_6h > 0),
         any_questions_over_24h = as.numeric(n_questions_over_24h > 0))

# Percent of respondents with at least one page above each threshold, in long
# format so it can be joined with the other samples by `name`.
summary_fb <- survey_fb_donors %>%
  summarise(perc_any_questions_over_1h = 100 * mean(any_questions_over_1h),
            perc_any_questions_over_6h = 100 * mean(any_questions_over_6h),
            perc_any_questions_over_24h = 100 * mean(any_questions_over_24h)) %>%
  pivot_longer(everything()) %>%
  rename(Facebook = value)

## Kane, Velez, and Barabas (2023): Lucid sample 2021 ####

survey_kane <- read_dta('data/surveys_raw/external/Lucid1_replicationdata.dta')

# Per-respondent break indicators: every "pagesubmit" column is one timed page
# (in seconds), and a page counts as a break when its duration exceeds 1, 6 or
# 24 hours. Pages without a usable duration are dropped, so respondents with
# no timed page at all drop out of the sample.
survey_kane <- survey_kane %>%
  # respondent key built from the row position
  mutate(id = row_number()) %>%
  select(id, contains("pagesubmit")) %>%
  pivot_longer(contains("pagesubmit"), names_to = "question", values_to = "duration_s") %>%
  mutate(duration_m = as.numeric(duration_s)/60) %>%
  filter(!is.na(duration_m)) %>%
  mutate(
    duration_over_1h = ifelse(duration_m > 60, 1, 0),
    duration_over_6h = ifelse(duration_m > 360, 1, 0),
    duration_over_24h = ifelse(duration_m > 1440, 1, 0)) %>%
  group_by(id) %>%
  summarize(n_questions = n(),
            n_questions_over_1h = sum(duration_over_1h, na.rm = TRUE),
            n_questions_over_6h = sum(duration_over_6h, na.rm = TRUE),
            n_questions_over_24h = sum(duration_over_24h, na.rm = TRUE)) %>%
  mutate(prop_questions_over_1h = n_questions_over_1h / n_questions,
         prop_questions_over_6h = n_questions_over_6h / n_questions,
         prop_questions_over_24h = n_questions_over_24h / n_questions) %>%
  mutate(any_questions_over_1h = ifelse(n_questions_over_1h > 0, 1, 0),
         any_questions_over_6h = ifelse(n_questions_over_6h > 0, 1, 0),
         any_questions_over_24h = ifelse(n_questions_over_24h > 0, 1, 0))

# Percent of respondents with at least one page above each threshold, in long
# format so it can be joined with the other samples by `name`.
summary_kane <- survey_kane %>%
  summarize(perc_any_questions_over_1h = mean(any_questions_over_1h)*100,
            perc_any_questions_over_6h = mean(any_questions_over_6h)*100,
            perc_any_questions_over_24h = mean(any_questions_over_24h)*100) %>%
  pivot_longer(perc_any_questions_over_1h:perc_any_questions_over_24h) %>%
  rename("Lucid" = value)

## Clemm von Hohenberg (2023): Dynata sample 2020 ####

# Raw export; the first two data rows are dropped before any processing.
survey_clemm_1 <- read.csv("data/surveys_raw/external/Truth+and+bias_30+June+2020_12.47.csv")[-(1:2), ]

# Per-respondent break indicators: every "Page.Submit" column is one timed
# page (in seconds), and a page counts as a break when its duration exceeds
# 1, 6 or 24 hours. Pages without a usable duration are dropped.
survey_clemm_1 <- survey_clemm_1 %>%
  select(ResponseId, contains("Page.Submit")) %>%
  pivot_longer(contains("Page.Submit"),
               names_to = "question", values_to = "duration_s") %>%
  mutate(duration_m = as.numeric(duration_s) / 60) %>%
  filter(!is.na(duration_m)) %>%
  mutate(duration_over_1h = as.numeric(duration_m > 60),
         duration_over_6h = as.numeric(duration_m > 360),
         duration_over_24h = as.numeric(duration_m > 1440)) %>%
  group_by(ResponseId) %>%
  summarise(n_questions = n(),
            n_questions_over_1h = sum(duration_over_1h, na.rm = TRUE),
            n_questions_over_6h = sum(duration_over_6h, na.rm = TRUE),
            n_questions_over_24h = sum(duration_over_24h, na.rm = TRUE)) %>%
  mutate(prop_questions_over_1h = n_questions_over_1h / n_questions,
         prop_questions_over_6h = n_questions_over_6h / n_questions,
         prop_questions_over_24h = n_questions_over_24h / n_questions,
         any_questions_over_1h = as.numeric(n_questions_over_1h > 0),
         any_questions_over_6h = as.numeric(n_questions_over_6h > 0),
         any_questions_over_24h = as.numeric(n_questions_over_24h > 0))

# Percent of respondents with at least one page above each threshold, in long
# format so it can be joined with the other samples by `name`.
summary_clemm_1 <- survey_clemm_1 %>%
  summarise(perc_any_questions_over_1h = 100 * mean(any_questions_over_1h),
            perc_any_questions_over_6h = 100 * mean(any_questions_over_6h),
            perc_any_questions_over_24h = 100 * mean(any_questions_over_24h)) %>%
  pivot_longer(everything()) %>%
  rename(Dynata = value)

## Clemm von Hohenberg (2023): Prolific sample 2020 ####

# Raw export; the first two data rows are dropped before any processing.
survey_clemm_2 <- read.csv("data/surveys_raw/external/Truth+and+bias+–+Congruence+pre-test+Prolific_13+June+2020_15.20.csv")[-(1:2), ]

# Per-respondent break indicators: every "Page.Submit" column is one timed
# page (in seconds), and a page counts as a break when its duration exceeds
# 1, 6 or 24 hours. Pages without a usable duration are dropped.
survey_clemm_2 <- survey_clemm_2 %>%
  select(ResponseId, contains("Page.Submit")) %>%
  pivot_longer(contains("Page.Submit"),
               names_to = "question", values_to = "duration_s") %>%
  mutate(duration_m = as.numeric(duration_s) / 60) %>%
  filter(!is.na(duration_m)) %>%
  mutate(duration_over_1h = as.numeric(duration_m > 60),
         duration_over_6h = as.numeric(duration_m > 360),
         duration_over_24h = as.numeric(duration_m > 1440)) %>%
  group_by(ResponseId) %>%
  summarise(n_questions = n(),
            n_questions_over_1h = sum(duration_over_1h, na.rm = TRUE),
            n_questions_over_6h = sum(duration_over_6h, na.rm = TRUE),
            n_questions_over_24h = sum(duration_over_24h, na.rm = TRUE)) %>%
  mutate(prop_questions_over_1h = n_questions_over_1h / n_questions,
         prop_questions_over_6h = n_questions_over_6h / n_questions,
         prop_questions_over_24h = n_questions_over_24h / n_questions,
         any_questions_over_1h = as.numeric(n_questions_over_1h > 0),
         any_questions_over_6h = as.numeric(n_questions_over_6h > 0),
         any_questions_over_24h = as.numeric(n_questions_over_24h > 0))

# Percent of respondents with at least one page above each threshold, in long
# format so it can be joined with the other samples by `name`.
summary_clemm_2 <- survey_clemm_2 %>%
  summarise(perc_any_questions_over_1h = 100 * mean(any_questions_over_1h),
            perc_any_questions_over_6h = 100 * mean(any_questions_over_6h),
            perc_any_questions_over_24h = 100 * mean(any_questions_over_24h)) %>%
  pivot_longer(everything()) %>%
  rename(Prolific = value)

# Combine the four per-sample summaries into one table with a column per
# sample. `name` (the threshold) is the only shared column, so it is the join
# key; stating it explicitly avoids the natural-join message.
summary_all <- summary_fb %>%
  left_join(summary_kane, by = "name") %>%
  left_join(summary_clemm_1, by = "name") %>%
  left_join(summary_clemm_2, by = "name") %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  # replace the variable names with the row labels shown in the table
  mutate(name = case_when(
    name == "perc_any_questions_over_1h" ~ "\\% any question > 1h",
    name == "perc_any_questions_over_6h" ~ "\\% any question > 6h",
    name == "perc_any_questions_over_24h" ~ "\\% any question > 24h"))

# NOTE(review): this label carries a "tab:" prefix while the other table
# labels in this script do not; confirm this is intended.
kable(
  summary_all,
  caption = "Break patterns in Facebook sample and external studies",
  format = "html", booktabs = TRUE, escape = FALSE, linesep = "",
  label = "tab:qualityall",
  col.names = c("", "Facebook", "Lucid 2021",
                "Dynata 2020", "Prolific 2020")) %>%
  kable_styling(full_width = FALSE, latex_options = c("HOLD_position", "scale_down")) %>%
  column_spec(1, width = "7cm") %>%
  save_kable(., file = "output/tabF23_rq4_break_patterns.html")

# TEXT DESCRIPTIVES ============================================================

# Paper text: the percent of respondents attempting to take at least one
# questionnaire repeatedly is 26.7% for Facebook, 71.5% for Lucid and 15.3%
# for YouGov.

select(repeated_1h_summ, dataset, any_repeated_perc)

# Paper text: the mean number of repeated questionnaires is 1.1 and 0.5 for
# the Facebook and YouGov samples, but 8.3 for the Lucid sample.

select(repeated_1h_summ, dataset, count_repeated_urls_mean)

# Paper text: the average percentages of repeated participation range from
# 2.2% (YouGov) to 5.0% (Facebook) and 6.9% (Lucid).

select(repeated_1h_summ, dataset, perc_repeated_mean)

# Paper text: in the Facebook sample, 74.9% of professionals took at least one
# questionnaire more than once, versus only 21.4% of non-professionals. For
# Lucid, these numbers are 85.4% versus 45.0%; for YouGov, 69.5% versus 8.1%.

select(repeated_1h_summ_prof, dataset, professional_1, any_repeated_perc)

# Paper text: with the 24-hour cutoff, the share of subjects taking at least
# one questionnaire more than once is still 25.8%, 73.8% and 15.1% for
# Facebook, Lucid and YouGov respectively.

select(repeated_24h_summ, dataset, any_repeated_perc)



